/*******************************************************************************

This file imports the raw CHED data and cleans it;

*******************************************************************************/

capture log close
capture program drop _all
capture macro drop _all
drop _all
set more off

*SET PATH NAMES;
global raw_data 
global stata_data
global geo_data 
global other_data 

********************************
*I.IMPORTING THE DATA TO FOLDER*
********************************

******2005-2012*****************************************************************

foreach x in 0506 0607 0708 0809 0910 1011 1112 {
	insheet using "${raw_data}/ched_`x'.csv", clear
	rename g_total gmftotal
	drop six_digitprogdis
	rename normalized6d_progdis six_digitprogdis
	tostring six_digitprogdis, replace
	tostring pscedgroup, replace
	gen typedata="Both"
	gen insttype="private" if instype=="PN" | instype=="PS"
	replace insttype="public" if insttype==""
	save "${stata_data}/ched_`x'.dta", replace
}

foreach x in 0506 0607 0708 0809 0910 1011 1112 {
	use "${stata_data}/ched_`x'.dta", clear
	replace province = subinstr(province, char(241), "n", .)					//replacing weird strings in province: Las Pi�as City, Para�aque City
	merge m:1 province using "${geo_data}provmatch1.dta"
	replace province=province_match if _m==3 & province!="Metro Manila"
	drop if _m==2
	drop province_match _m
		
	keep regioncode acadyear instcode instname insttype province citymunicipality six_digitprogdis pscedname pscedgroup disciplinegroup typedata emtotal eftotal emftotal gmtotal gftotal gmftotal
	order regioncode acadyear instcode instname insttype province citymunicipality six_digitprogdis pscedname pscedgroup disciplinegroup typedata emtotal eftotal emftotal gmtotal gftotal gmftotal
	save "${stata_data}/ched_`x'.dta", replace
}


******2012/2013 and 2013/2014***************************************************
* These have different naming structure so doing it outside of the loop

import excel "${raw_data}/AY 2012_13 to AY 2013_14 EnrolGrads DB as of August 29 2014xls.xlsx", sheet("2012_13") firstrow clear
rename g_total gmftotal

replace province = subinstr(province, char(241), "n", .) 						//replacing weird strings in province: Las Pi�as City, Para�aque City
merge m:1 province using "${geo_data}provmatch1.dta"
replace province=province_match if _m==3 & province!="Metro Manila"
drop if _m==2
drop province_match _merge

gen typedata="Both"

gen insttype="private" if instype=="PN" | instype=="PS"
replace insttype="public" if insttype==""

keep regioncode acadyear instcode instname insttype province citymunicipality six_digitprogdis pscedname pscedgroup disciplinegroup typedata emtotal eftotal emftotal gmtotal gftotal gmftotal
order regioncode acadyear instcode instname insttype province citymunicipality six_digitprogdis pscedname pscedgroup disciplinegroup typedata emtotal eftotal emftotal gmtotal gftotal gmftotal
save "${stata_data}/ched_1213.dta", replace


import excel "${raw_data}/AY 2012_13 to AY 2013_14 EnrolGrads DB as of August 29 2014xls.xlsx", sheet("2013_14") firstrow clear
rename g_total gmftotal

replace province = subinstr(province, char(241), "n", .) 						//replacing weird strings in province: Las Pi�as City, Para�aque City
merge m:1 province using "${geo_data}provmatch1.dta"
replace province=province_match if _m==3 & province!="Metro Manila"
drop if _m==2
drop province_match _merge

gen typedata="Both"

gen insttype="private" if instype=="PN" | instype=="PS"
replace insttype="public" if insttype==""

keep regioncode acadyear instcode instname insttype province citymunicipality six_digitprogdis pscedname pscedgroup disciplinegroup typedata emtotal eftotal emftotal gmtotal gftotal gmftotal
order regioncode acadyear instcode instname insttype province citymunicipality six_digitprogdis pscedname pscedgroup disciplinegroup typedata emtotal eftotal emftotal gmtotal gftotal gmftotal
save "${stata_data}/ched_1314.dta", replace


******2003/2004*****************************************************************
import excel "${raw_data}/CHED enroll and grad 2003-04.xls", sheet("Clean") firstrow clear
drop if acadyear==""

replace province = subinstr(province, char(241), "n", .) 						//replacing weird strings in province: Las Pi�as City, Para�aque City
merge m:1 province using "${geo_data}provmatch1.dta"
replace province=province_match if _m==3 & province!="Metro Manila"
drop if _m==2
drop province_match _merge

replace citymunicipality = subinstr(citymunicipality, char(241), "n", .) if province=="Metro Manila" //replacing weird strings in province: Las Pi�as City, Para�aque City
gen citymunicipality_temp= province+citymunicipality if province=="Metro Manila"
merge m:1 citymunicipality_temp using "${geo_data}provmatch2.dta"
replace province=province_match2 if _m==3 & province=="Metro Manila"
drop if _m==2
drop province_match _merge citymunicipality_temp

gen typedata="Both"

gen insttype="private" if instype=="PN" | instype=="PS"
replace insttype="public" if insttype==""

keep regioncode acadyear instcode instname insttype province citymunicipality six_digitprogdis pscedgroup typedata emtotal eftotal emftotal gmtotal gftotal gmftotal
order regioncode acadyear instcode instname insttype province citymunicipality six_digitprogdis pscedgroup typedata emtotal eftotal emftotal gmtotal gftotal gmftotal
save "${stata_data}/ched_0304.dta", replace


******2004/2005*****************************************************************
* This file is missing geographical information
*Fill in from the two years surrounding it
*Do this for any years missing geographical information (municipality, province)

import excel "${raw_data}/CHED enroll and grad 2004-05.xls", sheet("Clean") firstrow clear
rename enftotal emftotal
drop if acadyear==""

preserve

use "${stata_data}ched_0506.dta", clear
contract instcode province
gen length=length(instcode)
replace instcode="0" + instcode if length==4
drop length _freq
save "${stata_data}geo_temp05.dta", replace

use "${stata_data}ched_0304.dta", clear
contract instcode province
drop if instcode==""
drop if instcode=="13155" | instcode=="13056" 									//This observation is repeated twice
drop _freq
save "${stata_data}geo_temp03.dta", replace

restore

merge m:1 instcode using "${stata_data}geo_temp05.dta"
drop if _m==2
merge m:1 instcode using "${stata_data}geo_temp03.dta", update gen(_m2)
drop if _m2==2
drop _merge _m2

gen typedata="Both"

gen insttype="private" if instype=="PN" | instype=="PS"
replace insttype="public" if insttype==""
replace insttype="" if instype==""

keep regioncode acadyear instcode insttype province six_digitprogdis pscedgroup typedata emtotal eftotal emftotal gmtotal gftotal gmftotal
order regioncode acadyear instcode insttype province six_digitprogdis pscedgroup typedata emtotal eftotal emftotal gmtotal gftotal gmftotal
erase "${stata_data}geo_temp05.dta"
erase "${stata_data}geo_temp03.dta"
save "${stata_data}/ched_0405.dta", replace


******2002/2003*****************************************************************

use "${other_data}CHED_2002-03.dta", clear
gen acadyear="2002/03"

replace province = subinstr(province, char(241), "n", .) 						//replacing weird strings in province: Las Pi�as City, Para�aque City
merge m:1 province using "${geo_data}provmatch1.dta"
replace province=province_match if _m==3 & province!="Metro Manila"
drop if _m==2
drop if _m==1																	
drop province_match _merge

replace citymunicipality = subinstr(citymunicipality, char(241), "n", .) if province=="Metro Manila" //replacing weird strings in province: Las Pi�as City, Para�aque City
gen citymunicipality_temp= province+citymunicipality if province=="Metro Manila"
merge m:1 citymunicipality_temp using "${geo_data}provmatch2.dta"
replace province=province_match2 if _m==3 & province=="Metro Manila"
drop if _m==2
drop province_match _merge citymunicipality_temp

gen typedata="Both"
gen insttype="private" if instype=="PN" | instype=="PS"
replace insttype="public" if insttype==""

keep regioncode acadyear instcode insttype instname province citymunicipality six_digitprogdis pscedgroup typedata emtotal eftotal emftotal gmtotal gftotal gmftotal
order regioncode acadyear instcode insttype instname province citymunicipality six_digitprogdis pscedgroup typedata emtotal eftotal emftotal gmtotal gftotal gmftotal
save "${stata_data}/ched_0203.dta", replace


******2001/2002*****************************************************************

use "${other_data}CHED_2001-02.dta", clear
gen acadyear="2001/02"
rename citymunicipality province												

replace province = subinstr(province, char(241), "n", .) 						//replacing weird strings in province: Las Pi�as City, Para�aque City
merge m:1 province using "${geo_data}provmatch1.dta"
replace province=province_match if _m==3 & province!="Metro Manila"
drop if _m==2
drop province_match _merge

gen typedata="Both"
gen insttype="private" if instype=="PN" | instype=="PS"
replace insttype="public" if insttype==""

keep regioncode acadyear instcode instname insttype province six_digitprogdis pscedgroup typedata emtotal eftotal emftotal gmtotal gftotal gmftotal
order regioncode acadyear instcode instname insttype province six_digitprogdis pscedgroup typedata emtotal eftotal emftotal gmtotal gftotal gmftotal
save "${stata_data}/ched_0102.dta", replace


******1998/1999-2000/2001 ENROLLMENTS*******************************************

import excel "${raw_data}/Updated Enrolment 1998-99 to 2000-01.xlsx", firstrow clear

rename INSTCOD instcode
rename REGION regioncode
rename PROGDIS six_digitprogdis
rename DISC pscedgroup

gen acadyear="1998/99" if YEAR==1998
replace acadyear="1999/00" if YEAR==1999
replace acadyear="2000/01" if YEAR==2000

/*Missing geographic information is filled in from previous and following years*/

preserve

use "${stata_data}ched_0102.dta", clear
contract instcode province
drop if instcode=="13073" & province==""										//duplicate observation
drop _freq
save "${stata_data}geo_temp01.dta", replace

restore

***

merge m:1 instcode using "${stata_data}geo_temp01.dta"							
drop if _m==2
drop _m

collapse (sum) ENROLMENT, by(regioncode instcode SCHTYPE SCHSTYPE2 PROGNAM CRSE_CODE pscedgroup OECDClass CHEDClass ISCEDLev six_digitprogdis PROGLEV STUDYER STUDGEN YEAR acadyear province)
reshape wide ENROLMENT, i(regioncode instcode CRSE_CODE PROGNAM six_digitprogdis CHEDClass PROGLEV acadyear province STUDYER) j(STUDGEN) string
reshape wide ENROLMENTF ENROLMENTM, i(regioncode instcode CRSE_CODE PROGNAM six_digitprogdis CHEDClass PROGLEV acadyear province) j(STUDYER)
egen emtotal=rowtotal(ENROLMENTM*)
egen eftotal=rowtotal(ENROLMENTF*)
egen emftotal=rowtotal(ENROLMENT*)
drop ENROLMENT*

gen typedata="Enrollments"

gen insttype="private" if SCHTYPE=="P"
replace insttype="public" if SCHTYPE=="G"

keep regioncode acadyear instcode insttype province six_digitprogdis pscedgroup typedata emtotal eftotal emftotal
order regioncode acadyear instcode insttype province six_digitprogdis pscedgroup typedata emtotal eftotal emftotal
erase "${stata_data}geo_temp01.dta"
save "${stata_data}ched_9801e.dta", replace


******1998/1999-2000/2001 GRADUATIONS*******************************************

import excel "${raw_data}/New Graduates 1998-2000.xlsx", firstrow clear case(lower)

rename instcod instcode
rename region regioncode
rename instnam instname
rename progdis six_digitprogdis
rename disc pscedgroup

gen acadyear="1998/99" if year==1998
replace acadyear="1999/00" if year==1999
replace acadyear="2000/01" if year==2000

preserve

use "${stata_data}ched_0102.dta", clear
contract instcode province
drop if instcode=="13073" & province==""										//duplicate observation
drop _freq
save "${stata_data}geo_temp01.dta", replace

restore

***

merge m:1 instcode using "${stata_data}geo_temp01.dta"							
drop if _m==2
drop _m

collapse (sum) graduates, by(regioncode instcode instname prognam oecdclass chedclass pscedgroup six_digitprogdis iscedlev proglev studgen year type class acadyear province)
reshape wide graduates, i(instcode instname prognam oecdclass chedclass pscedgroup six_digitprogdis iscedlev acadyear province) j(studgen)
rename graduates1 gmtotal
rename graduates2 gftotal
replace gmtotal=0 if gmtotal==.
replace gftotal=0 if gftotal==.
gen gmftotal=gmtotal+gftotal

gen typedata="Graduations"

keep regioncode acadyear instcode instname province six_digitprogdis pscedgroup typedata gmtotal gftotal gmftotal
order regioncode acadyear instcode instname province six_digitprogdis pscedgroup typedata gmtotal gftotal gmftotal
erase "${stata_data}geo_temp01.dta" 
save "${stata_data}ched_9801g.dta", replace


******1996/1997 ENROLLMENTS*****************************************************

insheet using "${raw_data}/EN9697.csv", clear

rename schcode instcode
rename region regioncode
rename schname instname
rename em emtotal
rename ef eftotal
rename eboth emftotal
rename progdis six_digitprogdis
rename discode pscedgroup


gen acadyear="1996/97"

merge m:1 schcity using "${geo_data}provmatch3.dta"								
replace schcity=province_match if _m==3 & schcity!="METRO MANILA"
drop if _m==2
drop _m province_match

replace schdist = subinstr(schdist, char(209), "N", .)	if schcity=="METRO MANILA"	//removing ~letters to facilitate matching 
merge m:1 schdist schcity using "${geo_data}provmatch4.dta"
replace schcity=province_match2 if _m==3
drop if _m==2
drop _m province_match2

rename schcity province

save "${stata_data}chedtemp96.dta", replace										

gen typedata="Enrollments"

gen insttype="private" if schtype=="P"
replace insttype="public" if schtype=="G"

keep regioncode acadyear instcode instname insttype province six_digitprogdis pscedgroup typedata emtotal eftotal emftotal
order regioncode acadyear instcode instname insttype province six_digitprogdis pscedgroup typedata emtotal eftotal emftotal
save "${stata_data}ched_9697e.dta", replace


******1996/1997 GRADUATIONS*****************************************************

import excel "${raw_data}CHED grad 1994-95, 1996-97, 1997-98.xls", sheet("Clean_9697") firstrow clear case(lower)

rename six_digitprogdisc six_digitprogdis

preserve

use "${stata_data}ched_9697e.dta", clear

contract instcode province
drop _freq
tostring instcode, replace
gen length=length(instcode)
replace instcode="0"+instcode if length==4
drop length
save "${stata_data}geo_temp96.dta", replace

restore

***

merge m:1 instcode using "${stata_data}geo_temp96.dta"							
drop if _m==2
drop _m

gen typedata="Graduations (not by gender)"

keep regioncode acadyear instcode instname province six_digitprogdis pscedgroup typedata gmftotal
order regioncode acadyear instcode instname province six_digitprogdis pscedgroup typedata gmftotal
erase "${stata_data}geo_temp96.dta"
save "${stata_data}ched_9697g.dta", replace


******1997/1998 ENROLLMENTS*****************************************************

import excel "${raw_data}CHED enroll 1997-98.xls", firstrow case(l) clear

rename instcod instcode
drop firstofinstcod 
rename region regioncode
rename schname instname


gen acadyear="1997/98"															


preserve

use "${stata_data}ched_0102.dta", clear
contract instcode province instname
drop if instcode=="13073" & province==""										//duplicate observation
duplicates drop instcode, force
drop _freq
save "${stata_data}geo_temp01.dta", replace

restore

***

merge m:1 instcode using "${stata_data}geo_temp01.dta"
drop if _m==2
drop _m


preserve

use "${stata_data}ched_0102.dta", clear
contract instcode insttype
duplicates drop instcode, force
drop _freq
save "${stata_data}geo_temp02.dta", replace

restore

merge m:1 instcode using "${stata_data}geo_temp02.dta"
drop if _m==2
drop _m


preserve

use "${stata_data}chedtemp96.dta", clear
contract crse_code six_digitprogdis												
drop _freq
save "${stata_data}progcode_temp96.dta", replace

restore

***

merge m:1 crse_code using "${stata_data}progcode_temp96.dta"					
drop if _m==2
drop _m

preserve

use "${stata_data}chedtemp96.dta", clear
contract crse_code pscedgroup
drop _freq
save "${stata_data}pscedcode_temp96.dta", replace

restore

***

merge m:1 crse_code using "${stata_data}pscedcode_temp96.dta"					
drop _m

rename sumofenrolment emftotal

gen typedata="Enrollments (not by gender)"

keep regioncode acadyear instcode insttype instname province six_digitprogdis typedata emftotal pscedgroup
order regioncode acadyear instcode insttype instname province six_digitprogdis typedata emftotal pscedgroup
erase "${stata_data}geo_temp01.dta"
erase "${stata_data}geo_temp02.dta"
erase "${stata_data}progcode_temp96.dta"
erase "${stata_data}chedtemp96.dta"
erase "${stata_data}pscedcode_temp96.dta"
save "${stata_data}ched_9798e.dta", replace


******1997/1998 GRADUATIONS	-- NO DATA******************************************


******1995/1996 ENROLLMENTS*****************************************************

import excel "${raw_data}CHED enroll 1995-96.xls", firstrow case(lower) clear

rename schcode instcode
rename region regioncode
rename schname instname
rename progdis six_digitprogdis
rename discode pscedgroup


gen acadyear="1995/96"

merge m:1 schcity using "${geo_data}provmatch5.dta"								
replace schcity=province_match if _m==3 & schcity!="METRO MANILA"
drop if _m==2
drop _m province_match

replace schdist = subinstr(schdist, uchar(209), "N", .)							
merge m:1 schdist schcity using "${geo_data}provmatch6.dta"
replace schcity=province_match2 if _m==3
drop if _m==2
drop _m province_match2

rename schcity province

rename etotal emftotal
egen emtotal=rowtotal(e_1m e_2m e_3m e_4m e_5m e_6m)
egen eftotal=rowtotal(e_1f e_2f e_3f e_4f e_5f e_6f)
drop e_*

save "${stata_data}chedtemp95.dta", replace										

gen typedata="Enrollments"

gen insttype="private" if schstype2=="PS" | schstype2=="PN"
replace insttype="public" if insttype==""

keep regioncode acadyear instcode instname insttype province six_digitprogdis pscedgroup typedata emtotal eftotal emftotal
order regioncode acadyear instcode instname insttype province six_digitprogdis pscedgroup typedata emtotal eftotal emftotal
save "${stata_data}ched_9596e.dta", replace

******1994/1995 ENROLLMENTS*****************************************************

insheet using "${raw_data}/EN9495.csv", clear

rename schcode instcode
rename region regioncode
rename schname instname 
rename etotmale emtotal
rename etotfema eftotal
rename etotal emftotal


gen acadyear="1994/95"

merge m:1 schcity using "${geo_data}provmatch7.dta"								
replace schcity=province_match if _m==3 & schcity!="METRO MANILA"
drop if _m==2
drop _m province_match



preserve

use "${stata_data}ched_9596e.dta", clear
contract instcode province
drop _f
save "${stata_data}geo_temp95.dta", replace

restore

***

tostring instcode, replace
gen length=length(instcode)
replace instcode="0"+instcode if length==4
drop length

merge m:1 instcode using "${stata_data}geo_temp95.dta"
replace schcity=province if _m==3 & schcity=="METRO MANILA"
drop if _m==2
drop _m province

rename schcity province

replace province="THIRD DISTRICT" if instname=="GREGORIO ARANETA UNIVERSITY FOUNDATION"	


preserve

use "${stata_data}chedtemp95.dta", clear	
contract crse_code six_digitprogdis												
drop _freq
save "${stata_data}progcode_temp95.dta", replace

restore

***

merge m:1 crse_code using "${stata_data}progcode_temp95.dta"					
drop if _m==2
drop _m

preserve

use "${stata_data}chedtemp95.dta", clear	
contract crse_code pscedgroup												
drop _freq
save "${stata_data}pscedgroup_temp95.dta", replace

restore

***

merge m:1 crse_code using "${stata_data}pscedgroup_temp95.dta"					
drop if _m==2
drop _m

gen typedata="Enrollments"

gen insttype="private" if schtype=="P"
replace insttype="public" if insttype==""

keep regioncode acadyear instcode instname insttype province six_digitprogdis pscedgroup typedata emtotal eftotal emftotal
order regioncode acadyear instcode instname insttype province six_digitprogdis pscedgroup typedata emtotal eftotal emftotal
erase "${stata_data}progcode_temp95.dta"
erase "${stata_data}geo_temp95.dta"
erase "${stata_data}pscedgroup_temp95.dta"
save "${stata_data}ched_9495e.dta", replace



******1990/1991-1993/1994 ENROLLMENTS*******************************************

insheet using "${raw_data}En1990-94BHEData.csv", clear

rename schcode instcode 
rename region regioncode
rename schname instname
rename discode pscedgroup
rename progdis six_digitprogdis

collapse (sum) sy9091 sy9192 sy9293 sy9394, by(regioncode instcode instname schtype schcity address sex crse_code six_digitprogdis pscedgroup) 
reshape long sy, i(regioncode instcode instname schtype schcity address sex crse_code six_digitprogdis pscedgroup) j(acadyear)
reshape wide sy, i(regioncode instcode instname schtype schcity address acadyear crse_code six_digitprogdis pscedgroup) j(sex) string

tostring acadyear, replace
replace acadyear="1990/91" if acadyear=="9091"
replace acadyear="1991/92" if acadyear=="9192"
replace acadyear="1992/93" if acadyear=="9293"
replace acadyear="1993/94" if acadyear=="9394"

//gets the province from the address
gen comlocation=strrpos(address,",") if address!=""								
gen temploc=substr(address, comlocation, .)										
replace temploc=regexr(temploc, ",", "")
replace temploc=ltrim(temploc)
replace temploc=address if comlocation==0
replace schcity=temploc if schcity=="" & temploc!=""
drop comlocation temploc

replace schcity = subinstr(schcity, char(209), "N", .)							
merge m:1 schcity using "${geo_data}provmatch8.dta"								
replace schcity=province_match if _m==3 & schcity!="METRO MANILA"
drop if _m==2
drop _m province_match



preserve

contract instcode instname schcity address
drop _freq
duplicates tag instname, gen(tag)												
keep if tag>0																	

gen missing=1 if schcity=="" 
bys instname: egen missinggrp=max(missing)
keep if missinggrp==1															

duplicates tag instcode, gen(tag2)
drop if tag>1 & tag2==0 														

drop if missing==1
drop tag tag2 missing missinggrp
bys instname: gen number=_n
drop if number==2																

keep instcode instname schcity
rename schcity province_match
save "${stata_data}geo_temp.dta", replace

restore
***

merge m:1 instcode instname using "${stata_data}geo_temp.dta"
replace schcity=province_match if schcity=="" & _m==3
drop if _m==2
drop _m province_match

preserve

use "${stata_data}ched_9495e.dta", clear
contract province instname
drop _freq
drop if province==""
duplicates drop instname, force
save "${stata_data}geo_temp94.dta", replace

use "${stata_data}ched_9596e.dta", clear
contract instname province
drop if province==""
duplicates tag instname, gen(tag)
drop if tag>0
drop _f tag
save "${stata_data}geo_temp95.dta", replace

restore
***

merge m:1 instname using "${stata_data}geo_temp94.dta"							
replace schcity=province if (schcity=="" | schcity=="METRO MANILA") & _m==3		
drop if _m==2
drop _m province 

merge m:1 instname using "${stata_data}geo_temp95.dta"
replace schcity=province if schcity=="" & _m==3	
drop if _m==2
drop _m province

merge m:1 instname using "${geo_data}provmatch9.dta"
replace schcity=province_match if schcity=="METRO MANILA" & _m==3
drop if _m==2
drop _m province_match

merge m:1 instname using "${geo_data}provmatch10.dta"							
replace schcity=province_match if schcity=="" & _m==3
drop if _m==2
drop _m province_match

rename schcity province

rename syF eftotal
rename syM emtotal
replace eftotal=0 if eftotal==.
replace emtotal=0 if emtotal==.
gen emftotal=eftotal+emtotal

gen typedata="Enrollments"

gen insttype="private" if schtype=="P"
replace insttype="public" if schtype=="G"

keep regioncode acadyear instcode instname insttype province six_digitprogdis pscedgroup typedata emtotal eftotal emftotal
order regioncode acadyear instcode instname insttype province six_digitprogdis pscedgroup typedata emtotal eftotal emftotal
erase "${stata_data}geo_temp.dta"
erase "${stata_data}geo_temp94.dta"
erase "${stata_data}geo_temp95.dta"
save "${stata_data}ched_9094e.dta", replace


***********************
*II.APPENDING THE DATA*
***********************

******STANDARDIZING FORMAT******************************************************

use "${stata_data}ched_0708.dta", clear
tostring regioncode, replace
save, replace

use "${stata_data}ched_0809.dta", clear
tostring regioncode, replace
save, replace

use "${stata_data}ched_0910.dta", clear
tostring regioncode, replace
save, replace

use "${stata_data}ched_1011.dta", clear
tostring regioncode, replace
save, replace

use "${stata_data}ched_1112.dta", clear
tostring regioncode, replace
save, replace

use "${stata_data}ched_0203.dta", clear
tostring regioncode, replace
save, replace

use "${stata_data}ched_0102.dta", clear
tostring regioncode, replace
save, replace

use "${stata_data}ched_9798e.dta", clear
tostring six_digitprogdis, replace
tostring pscedgroup, replace
save, replace

use "${stata_data}ched_9697e.dta", clear
tostring regioncode, replace
tostring instcode, replace
tostring six_digitprogdis, replace
tostring pscedgroup, replace
save, replace

use "${stata_data}ched_9495e.dta", clear
tostring regioncode, replace
tostring pscedgroup, replace
save, replace

use "${stata_data}ched_9495g.dta", clear
tostring pscedgroup, replace
save, replace

use "${stata_data}ched_9094e.dta", clear
tostring regioncode, replace
tostring instcode, replace
tostring six_digitprogdis, replace
tostring pscedgroup, replace
save, replace

******APPENDING*****************************************************************

use "${stata_data}ched_0506.dta", clear
append using "${stata_data}ched_0607.dta"
append using "${stata_data}ched_0708.dta"
append using "${stata_data}ched_0809.dta"
append using "${stata_data}ched_0910.dta"
append using "${stata_data}ched_1011.dta"
append using "${stata_data}ched_1112.dta"
append using "${stata_data}ched_1213.dta"
append using "${stata_data}ched_1314.dta"

append using "${stata_data}ched_0405.dta"
append using "${stata_data}ched_0304.dta"
append using "${stata_data}ched_0203.dta"
append using "${stata_data}ched_0102.dta"

append using "${stata_data}ched_9801e.dta"
append using "${stata_data}ched_9801g.dta"
append using "${stata_data}ched_9798e.dta"
append using "${stata_data}ched_9697e.dta"
append using "${stata_data}ched_9697g.dta"
append using "${stata_data}ched_9596e.dta"
append using "${stata_data}ched_9596g.dta"
append using "${stata_data}ched_9495e.dta"
append using "${stata_data}ched_9495g.dta"
append using "${stata_data}ched_9094e.dta"

save "${stata_data}/finalched.dta", replace


*********************************
*III.CLEANING THE DATA*
*********************************

use "${stata_data}/finalched.dta", clear

******Clean the enrollment and graduation numbers*******************************
*This makes sure 0s are in the data and the missings are for when data are 
*unavailable

replace emftotal=0 if emftotal==. & (typedata=="Enrollments" | typedata=="Both" | typedata=="Enrollments (not by gender)")

foreach var of var emtotal eftotal {
	replace `var'=0 if `var'==. & (typedata=="Enrollments" | typedata=="Both")
	}

replace gmftotal=0 if gmftotal==. & (typedata=="Graduations" | typedata=="Both" | typedata=="Graduations (not by gender)")

foreach var of var gftotal gmtotal {
	replace `var'=0 if `var'==. & (typedata=="Graduations" | typedata=="Both")
	}

*This double checks that the sum of genders are the same	
gen sumemftotal=emtotal+eftotal
replace emftotal=sumemftotal if sumemftotal!=emftotal & sumemftotal!=.			//2 replaced here
drop sumemftotal

gen sumgmftotal=gmtotal+gftotal
replace gmftotal=sumgmftotal if sumgmftotal!=gmftotal & sumgmftotal!=.			// 0 replaced here
drop sumgmftotal

save "${stata_data}finalched.dta", replace

merge m:1 regioncode acadyear instcode typedata using "${geo_data}missingprovince.dta" 
replace province=new_province if _m==3 & province==""
drop new_province noprovince _merge

merge m:1 regioncode acadyear instcode typedata using "${geo_data}missinginsttype.dta" 
replace insttype=new_insttype if _m==3 & insttype==""
drop new_insttype _merge

save "${stata_data}finalched_v2.dta", replace
